*===============================================================================
*
*					WORKER BELIEFS ABOUT OUTSIDE OPTIONS
*		(c)	Simon Jaeger, Christopher Roth, Nina Roussille, Benjamin Schoefer
*							  2023 December 5
*							   SOEP-IAB Data 
*
*===============================================================================


********************************************************************************
*								Data Preparation 							   *
********************************************************************************


* Close any log left open by a previous run; `cap` suppresses the error Stata
* raises when no log is currently open.
cap log close
* Open (and overwrite) the log for this script. The path is quoted so that a
* ${log} directory whose name contains spaces still resolves correctly.
log using "${log}/1_data_prep.log", replace
* Fix the random-number seed so that the runiform() draw used later in this
* script as a tie-breaker is reproducible across runs.
set seed 6000

* Data preparation using SOEP-ADIAB


*** creating base IAB dataset

* Load the raw SOEP-ADIAB file. The path is quoted so that an ${orig} directory
* whose name contains spaces still resolves correctly.
use "${orig}/SOEP-ADIAB_7519_v1.dta", clear
su *

* Sample restrictions.
* Note: observations with missing begorig are retained by the next line, because
* Stata treats missing as larger than any number and the condition is then false.
drop if begorig < td(01jan2000) // keep spells with begorig on or after 1 January 2000
keep if quelle == 1 | quelle == 2 // only look at sources BeH and LeH
drop estatvor estatnach art_kuend arbzeit traeger alo_beg alo_dau // drop variables that are not filled for sources BeH or LeH

gen int jahr = year(begepi) // calendar year taken from begepi
label variable jahr "year"

* generate dummy variable indicating East (ost=1) or West Germany (ost=0);
* wo_bula codes 11-16 are treated as East, and ost stays missing when wo_bula is missing
gen byte ost = inrange(wo_bula, 11, 16) if !missing(wo_bula)
label variable ost "East Germany"
label define ost 0 "no" 1 "yes", replace
label values ost ost
tab ost
drop wo_* // all other regional information is discarded

* age is computed from years only (jahr minus gebjahr), not from exact dates
gen int age = jahr - gebjahr
label variable age "age" // age

compress // automatically assign optimal data types on all variables

* save data set in folder 'data'. Future program runs can use this file.
save "${data}/data_prep_ADIAB.dta", replace

* for each person-year, keep one main spell (with highest compensation)
keep pid betnr jahr begepi endepi tentgelt

* Total compensation of the spell: spell length in days (both end points
* included) times tentgelt (presumably a daily amount -- confirm against the
* data documentation).
* Stored as double: the default float type keeps only about 7 significant
* digits, which can round distinct totals to the same value and create
* artificial ties in the ranking below.
gen double spell_earnings = (endepi - begepi + 1) * tentgelt
drop if missing(spell_earnings) | spell_earnings < 0

* Negated copy so that an ascending sort puts the highest-paying spell first.
gen double sort_spell_earnings = -spell_earnings

* Random tie-breaker for spells with identical earnings within a person-year.
* Stored as double so the draws keep full precision; as float, runiform() has
* only about 2^24 distinct values and ties become likely in large files.
gen double random_sort_var = runiform()

* Rank spells within person-year. An explicit stable sort is used instead of
* -bysort- so that any remaining exact ties keep their current order rather
* than being ordered arbitrarily, which makes the selection reproducible.
sort pid jahr sort_spell_earnings random_sort_var, stable
by pid jahr: gen priority_spell = _n

* Keep only the top-ranked spell and the identifiers needed downstream.
keep if priority_spell == 1
keep pid betnr jahr tentgelt

* Path is quoted so that a ${data} directory containing spaces still resolves.
save "${data}/data_prep_ADIAB_pid_betnr_jahr.dta", replace

log close
clear
